In [4]:
import pickle
fileObject = open("./data/preprocessing",'r')
df_tablet = pickle.load(fileObject)
In [5]:
df_X = df_tablet[[
'word', 'sentence', 'flesch reading', 'word/sentence', 'syllable', 'display', 'sound', 'os', 'security', 'hardware', 'battery', 'bug', 'price', 'cs', 'wifi', 'accesory', 'app', 'compatible', 'depth', 'depth/word', 'usable', 'total topic', 'redundancy', 'redundancy/sentence', 'rank', 'pos topic', 'neg topic', 'density'
]]
df_y = df_tablet[['help class']]
In [6]:
from sklearn.cross_validation import train_test_split
X = df_X.as_matrix(columns=None)
y = np.ravel(df_y.values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)
=================================================================================================================
In [9]:
# baseline confirmation, implying that model has to perform at least as good as it
from sklearn.dummy import DummyClassifier
clf_Dummy = DummyClassifier(strategy='most_frequent')
clf_Dummy = clf_Dummy.fit(X_train, y_train)
print('baseline score =>', round(clf_Dummy.score(X_test, y_test), 2))
===================================================================================================================
(1) n_estimators
In [506]:
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier
from matplotlib.pyplot import axvline, axhline
recall_range = []
n_estimator_range = []
for i in np.arange(10, 20, 1):
clf_RF = RandomForestClassifier(oob_score=True, n_estimators=i).fit(X_train, y_train)
clf_RF_predicted = clf_RF.predict(X_test)
recall = round(recall_score(clf_RF_predicted, y_test), 2)
n_estimator_range.append(i)
recall_range.append(recall)
dictionary = dict(zip(n_estimator_range, recall_range))
plt.figure(figsize=(10, 3))
plt.plot(n_estimator_range, recall_range, color='#EA5959', label='max recall: %(n)0.2f \n%(s)s: %(v)2d' %
{'n':max(dictionary.values()), 's':'n estimator', 'v':max(dictionary, key=lambda i: dictionary[i])})
plt.scatter([max(dictionary, key=lambda i: dictionary[i]), ], [max(dictionary.values()), ], 80, color='#EA5959')
axhline(max(dictionary.values()), color='#EA5959', linewidth=1, linestyle='--')
axvline(max(dictionary, key=lambda i: dictionary[i]), color='#EA5959', linewidth=1, linestyle='--')
plt.legend(loc='lower right', prop={'size':12})
plt.xlim(min(n_estimator_range), max(n_estimator_range))
plt.ylim(min(recall_range)*0.98, max(recall_range)*1.02)
plt.ylabel('Recall')
plt.xlabel('n estimator');
(2) max features
In [511]:
recall_range = []
max_features_range = []
for i in np.arange(1, 15, 1):
clf_RF = RandomForestClassifier(oob_score=True, n_estimators=18, max_features=i).fit(X_train, y_train)
clf_RF_predicted = clf_RF.predict(X_test)
recall = round(recall_score(clf_RF_predicted, y_test), 2)
max_features_range.append(i)
recall_range.append(recall)
dictionary = dict(zip(max_features_range, recall_range))
plt.figure(figsize=(10, 3))
plt.plot(max_features_range, recall_range, color='#EA5959', label='max recall: %(n)0.2f \n%(s)s: %(v)2d' %
{'n':max(dictionary.values()), 's':'max features', 'v':max(dictionary, key=lambda i: dictionary[i])})
plt.scatter([max(dictionary, key=lambda i: dictionary[i]), ], [max(dictionary.values()), ], 80, color='#EA5959')
axhline(max(dictionary.values()), color='#EA5959', linewidth=1, linestyle='--')
axvline(max(dictionary, key=lambda i: dictionary[i]), color='#EA5959', linewidth=1, linestyle='--')
plt.legend(loc='lower right', prop={'size':12})
plt.xlim(min(max_features_range), max(max_features_range))
plt.ylim(min(recall_range)*0.98, max(recall_range)*1.02)
plt.ylabel('Recall')
plt.xlabel('max features');
(3) min sample leaf
In [513]:
recall_range = []
min_samples_leaf_range = []
for i in np.arange(1, 20, 1):
clf_RF = RandomForestClassifier(oob_score=True, n_estimators=18, max_features=14, min_samples_leaf=i).fit(X_train, y_train)
clf_RF_predicted = clf_RF.predict(X_test)
recall = round(recall_score(clf_RF_predicted, y_test), 2)
min_samples_leaf_range.append(i)
recall_range.append(recall)
dictionary = dict(zip(min_samples_leaf_range, recall_range))
plt.figure(figsize=(10, 3))
plt.plot(min_samples_leaf_range, recall_range, color='#EA5959', label='max recall: %(n)0.2f \n%(s)s: %(v)2d' %
{'n':max(dictionary.values()), 's':'min samples leaf', 'v':max(dictionary, key=lambda i: dictionary[i])})
plt.scatter([max(dictionary, key=lambda i: dictionary[i]), ], [max(dictionary.values()), ], 80, color='#EA5959')
axhline(max(dictionary.values()), color='#EA5959', linewidth=1, linestyle='--')
axvline(max(dictionary, key=lambda i: dictionary[i]), color='#EA5959', linewidth=1, linestyle='--')
plt.legend(loc='lower right', prop={'size':12})
plt.xlim(min(min_samples_leaf_range), max(min_samples_leaf_range))
plt.ylim(min(recall_range)*0.98, max(recall_range)*1.02)
plt.ylabel('Recall')
plt.xlabel('min_samples_leaf_range');
In [32]:
from sklearn.pipeline import Pipeline
pipeline_clf_train = Pipeline(
steps=[
('clf_RF', RandomForestClassifier()),
]
);
In [34]:
from sklearn.grid_search import GridSearchCV
parameters = {
'clf_RF__min_samples_leaf' : np.arange(1, 28, 1),
'clf_RF__max_features' : np.arange(10, 28, 1),
'clf_RF__criterion' :['gini', 'entropy'],
'clf_RF__n_estimators' : [10],
#'clf_RF__oob_score' : ['True]
}
gs_clf = GridSearchCV(pipeline_clf_train, parameters, n_jobs=-1, scoring='recall')
gs_clf = gs_clf.fit(X_train, y_train)
In [35]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
print("%s: %r" % (param_name, best_parameters[param_name]))
print('------------------------------')
print('recall score :', score.round(2))
In [531]:
clf_RF = RandomForestClassifier(n_estimators=18, max_features=14, min_samples_leaf=9, oob_score=True).fit(X_train, y_train)
clf_RF_predicted = clf_RF.predict(X_test)
In [532]:
from sklearn.metrics import classification_report, confusion_matrix
target_names = ['not helpful', 'helpful']
print(classification_report(y_test, clf_RF_predicted, target_names=target_names))
In [533]:
plt.figure(figsize=(4,4))
cm = confusion_matrix(y_test, clf_RF_predicted)
print(cm)
target_names = ['not helpful', 'helpful']
plt.grid(False)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=45)
plt.yticks(tick_marks, target_names)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label');
In [ ]:
clf_RF = RandomForestClassifier(n_estimators=10, max_features=19, min_samples_leaf=27, criterion='entropy', oob_score=True).fit(X_train, y_train)
clf_RF_predicted = clf_RF.predict(X_test)
In [338]:
from sklearn.metrics import classification_report, confusion_matrix
target_names = ['not helpful', 'helpful']
print(classification_report(y_test, clf_RF_predicted, target_names=target_names))
In [359]:
plt.figure(figsize=(4, 4))
cm = confusion_matrix(y_test, clf_RF_predicted)
print(cm)
target_names = ['not helpful', 'helpful']
plt.grid(False)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=45)
plt.yticks(tick_marks, target_names)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label');
In [540]:
clf_RF = RandomForestClassifier(n_estimators=18, max_features=14, min_samples_leaf=9, oob_score=True).fit(X_train, y_train)
clf_RF_predicted = clf_RF.predict(X_test)
In [541]:
from sklearn.metrics import roc_curve, auc
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, clf_RF.predict_proba(X_test)[:, 0], pos_label=0)
fpr_base, tpr_base, thresholds_base = roc_curve(y_test,clf_Dummy.predict_proba(X_test)[:, 0], pos_label=1)
plt.figure(figsize=(5, 5))
plt.plot(fpr_rf, tpr_rf, color='#E45A84', linewidth=3, linestyle='-',
label = 'random forest: %(performance)0.2f' % {'performance':auc(fpr_rf, tpr_rf)})
plt.plot(fpr_base, tpr_base, color='#FFACAC', linewidth=2, linestyle='--',
label = 'baseline: %(performance)0.2f' % {'performance':auc(fpr_base, tpr_base)})
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (Fall-Out)')
plt.ylabel('True Positive Rate (Recall)')
plt.title('ROC (Receiver operating characteristic)', fontdict={'fontsize': 12})
plt.legend(loc="lower right");
In [318]:
clf_RF = RandomForestClassifier(n_estimators=10, max_features=19, min_samples_leaf=27, criterion='entropy', oob_score=True).fit(X_train, y_train)
clf_RF_predicted = clf_RF.predict(X_test)
In [317]:
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, clf_RF.predict_proba(X_test)[:, 0], pos_label=0)
fpr_base, tpr_base, thresholds_base = roc_curve(y_test,clf_Dummy.predict_proba(X_test)[:, 0], pos_label=1)
plt.figure(figsize=(5, 5))
plt.plot(fpr_rf, tpr_rf, color='#E45A84', linewidth=3, linestyle='-',
label = 'random forest: %(performance)0.2f' % {'performance':auc(fpr_rf, tpr_rf)})
plt.plot(fpr_base, tpr_base, color='#FFACAC', linewidth=2, linestyle='--',
label = 'baseline: %(performance)0.2f' % {'performance':auc(fpr_base, tpr_base)})
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (Fall-Out)')
plt.ylabel('True Positive Rate (Recall)')
plt.title('ROC (Receiver operating characteristic)', fontdict={'fontsize': 12})
plt.legend(loc="lower right");
Greedy Method | Grid Search | |
---|---|---|
TPR | 0.68 | 0.75 |
AUC | 0.66 | 0.68 |